{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 44.机器学习"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![jiqixuexi.png](jiqixuexi.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.1机器学习的业务理解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.2 数据读入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\Desktop\\代码 40-47\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "os.chdir(r'C:\\\\Users\\\\Administrator\\\\Desktop\\\\代码 40-47')\n",
    "  \n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>diagnosis</th>\n",
       "      <th>radius_mean</th>\n",
       "      <th>texture_mean</th>\n",
       "      <th>perimeter_mean</th>\n",
       "      <th>area_mean</th>\n",
       "      <th>smoothness_mean</th>\n",
       "      <th>compactness_mean</th>\n",
       "      <th>concavity_mean</th>\n",
       "      <th>concave points_mean</th>\n",
       "      <th>...</th>\n",
       "      <th>radius_worst</th>\n",
       "      <th>texture_worst</th>\n",
       "      <th>perimeter_worst</th>\n",
       "      <th>area_worst</th>\n",
       "      <th>smoothness_worst</th>\n",
       "      <th>compactness_worst</th>\n",
       "      <th>concavity_worst</th>\n",
       "      <th>concave_points_worst</th>\n",
       "      <th>symmetry_worst</th>\n",
       "      <th>fractal_dimension_worst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>842302</td>\n",
       "      <td>M</td>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>...</td>\n",
       "      <td>25.38</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>842517</td>\n",
       "      <td>M</td>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>...</td>\n",
       "      <td>24.99</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>84300903</td>\n",
       "      <td>M</td>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>...</td>\n",
       "      <td>23.57</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>84348301</td>\n",
       "      <td>M</td>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>...</td>\n",
       "      <td>14.91</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>84358402</td>\n",
       "      <td>M</td>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>...</td>\n",
       "      <td>22.54</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \\\n",
       "0    842302         M        17.99         10.38          122.80     1001.0   \n",
       "1    842517         M        20.57         17.77          132.90     1326.0   \n",
       "2  84300903         M        19.69         21.25          130.00     1203.0   \n",
       "3  84348301         M        11.42         20.38           77.58      386.1   \n",
       "4  84358402         M        20.29         14.34          135.10     1297.0   \n",
       "\n",
       "   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n",
       "0          0.11840           0.27760          0.3001              0.14710   \n",
       "1          0.08474           0.07864          0.0869              0.07017   \n",
       "2          0.10960           0.15990          0.1974              0.12790   \n",
       "3          0.14250           0.28390          0.2414              0.10520   \n",
       "4          0.10030           0.13280          0.1980              0.10430   \n",
       "\n",
       "            ...             radius_worst  texture_worst  perimeter_worst  \\\n",
       "0           ...                    25.38          17.33           184.60   \n",
       "1           ...                    24.99          23.41           158.80   \n",
       "2           ...                    23.57          25.53           152.50   \n",
       "3           ...                    14.91          26.50            98.87   \n",
       "4           ...                    22.54          16.67           152.20   \n",
       "\n",
       "   area_worst  smoothness_worst  compactness_worst  concavity_worst  \\\n",
       "0      2019.0            0.1622             0.6656           0.7119   \n",
       "1      1956.0            0.1238             0.1866           0.2416   \n",
       "2      1709.0            0.1444             0.4245           0.4504   \n",
       "3       567.7            0.2098             0.8663           0.6869   \n",
       "4      1575.0            0.1374             0.2050           0.4000   \n",
       "\n",
       "   concave_points_worst  symmetry_worst  fractal_dimension_worst  \n",
       "0                0.2654          0.4601                  0.11890  \n",
       "1                0.1860          0.2750                  0.08902  \n",
       "2                0.2430          0.3613                  0.08758  \n",
       "3                0.2575          0.6638                  0.17300  \n",
       "4                0.1625          0.2364                  0.07678  \n",
       "\n",
       "[5 rows x 32 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bc_data = pd.read_csv('bc_data.csv', header=0)\n",
    "\n",
    "bc_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.3 数据理解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(569, 32)\n"
     ]
    }
   ],
   "source": [
    "print(bc_data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',\n",
      "       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',\n",
      "       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',\n",
      "       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',\n",
      "       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',\n",
      "       'fractal_dimension_se', 'radius_worst', 'texture_worst',\n",
      "       'perimeter_worst', 'area_worst', 'smoothness_worst',\n",
      "       'compactness_worst', 'concavity_worst', 'concave_points_worst',\n",
      "       'symmetry_worst', 'fractal_dimension_worst'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "print(bc_data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \\\n",
      "count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   \n",
      "mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   \n",
      "std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   \n",
      "min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   \n",
      "25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   \n",
      "50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   \n",
      "75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   \n",
      "max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   \n",
      "\n",
      "       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n",
      "count       569.000000        569.000000      569.000000           569.000000   \n",
      "mean          0.096360          0.104341        0.088799             0.048919   \n",
      "std           0.014064          0.052813        0.079720             0.038803   \n",
      "min           0.052630          0.019380        0.000000             0.000000   \n",
      "25%           0.086370          0.064920        0.029560             0.020310   \n",
      "50%           0.095870          0.092630        0.061540             0.033500   \n",
      "75%           0.105300          0.130400        0.130700             0.074000   \n",
      "max           0.163400          0.345400        0.426800             0.201200   \n",
      "\n",
      "       symmetry_mean           ...             radius_worst  texture_worst  \\\n",
      "count     569.000000           ...               569.000000     569.000000   \n",
      "mean        0.181162           ...                16.269190      25.677223   \n",
      "std         0.027414           ...                 4.833242       6.146258   \n",
      "min         0.106000           ...                 7.930000      12.020000   \n",
      "25%         0.161900           ...                13.010000      21.080000   \n",
      "50%         0.179200           ...                14.970000      25.410000   \n",
      "75%         0.195700           ...                18.790000      29.720000   \n",
      "max         0.304000           ...                36.040000      49.540000   \n",
      "\n",
      "       perimeter_worst   area_worst  smoothness_worst  compactness_worst  \\\n",
      "count       569.000000   569.000000        569.000000         569.000000   \n",
      "mean        107.261213   880.583128          0.132369           0.254265   \n",
      "std          33.602542   569.356993          0.022832           0.157336   \n",
      "min          50.410000   185.200000          0.071170           0.027290   \n",
      "25%          84.110000   515.300000          0.116600           0.147200   \n",
      "50%          97.660000   686.500000          0.131300           0.211900   \n",
      "75%         125.400000  1084.000000          0.146000           0.339100   \n",
      "max         251.200000  4254.000000          0.222600           1.058000   \n",
      "\n",
      "       concavity_worst  concave_points_worst  symmetry_worst  \\\n",
      "count       569.000000            569.000000      569.000000   \n",
      "mean          0.272188              0.114606        0.290076   \n",
      "std           0.208624              0.065732        0.061867   \n",
      "min           0.000000              0.000000        0.156500   \n",
      "25%           0.114500              0.064930        0.250400   \n",
      "50%           0.226700              0.099930        0.282200   \n",
      "75%           0.382900              0.161400        0.317900   \n",
      "max           1.252000              0.291000        0.663800   \n",
      "\n",
      "       fractal_dimension_worst  \n",
      "count               569.000000  \n",
      "mean                  0.083946  \n",
      "std                   0.018061  \n",
      "min                   0.055040  \n",
      "25%                   0.071460  \n",
      "50%                   0.080040  \n",
      "75%                   0.092080  \n",
      "max                   0.207500  \n",
      "\n",
      "[8 rows x 31 columns]\n"
     ]
    }
   ],
   "source": [
    "print(bc_data.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.4 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \\\n",
      "0         M        17.99         10.38          122.80     1001.0   \n",
      "1         M        20.57         17.77          132.90     1326.0   \n",
      "2         M        19.69         21.25          130.00     1203.0   \n",
      "3         M        11.42         20.38           77.58      386.1   \n",
      "4         M        20.29         14.34          135.10     1297.0   \n",
      "\n",
      "   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n",
      "0          0.11840           0.27760          0.3001              0.14710   \n",
      "1          0.08474           0.07864          0.0869              0.07017   \n",
      "2          0.10960           0.15990          0.1974              0.12790   \n",
      "3          0.14250           0.28390          0.2414              0.10520   \n",
      "4          0.10030           0.13280          0.1980              0.10430   \n",
      "\n",
      "   symmetry_mean           ...             radius_worst  texture_worst  \\\n",
      "0         0.2419           ...                    25.38          17.33   \n",
      "1         0.1812           ...                    24.99          23.41   \n",
      "2         0.2069           ...                    23.57          25.53   \n",
      "3         0.2597           ...                    14.91          26.50   \n",
      "4         0.1809           ...                    22.54          16.67   \n",
      "\n",
      "   perimeter_worst  area_worst  smoothness_worst  compactness_worst  \\\n",
      "0           184.60      2019.0            0.1622             0.6656   \n",
      "1           158.80      1956.0            0.1238             0.1866   \n",
      "2           152.50      1709.0            0.1444             0.4245   \n",
      "3            98.87       567.7            0.2098             0.8663   \n",
      "4           152.20      1575.0            0.1374             0.2050   \n",
      "\n",
      "   concavity_worst  concave_points_worst  symmetry_worst  \\\n",
      "0           0.7119                0.2654          0.4601   \n",
      "1           0.2416                0.1860          0.2750   \n",
      "2           0.4504                0.2430          0.3613   \n",
      "3           0.6869                0.2575          0.6638   \n",
      "4           0.4000                0.1625          0.2364   \n",
      "\n",
      "   fractal_dimension_worst  \n",
      "0                  0.11890  \n",
      "1                  0.08902  \n",
      "2                  0.08758  \n",
      "3                  0.17300  \n",
      "4                  0.07678  \n",
      "\n",
      "[5 rows x 31 columns]\n"
     ]
    }
   ],
   "source": [
    "data = bc_data.drop(['id'], axis=1)\n",
    "print(data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>radius_mean</th>\n",
       "      <th>texture_mean</th>\n",
       "      <th>perimeter_mean</th>\n",
       "      <th>area_mean</th>\n",
       "      <th>smoothness_mean</th>\n",
       "      <th>compactness_mean</th>\n",
       "      <th>concavity_mean</th>\n",
       "      <th>concave points_mean</th>\n",
       "      <th>symmetry_mean</th>\n",
       "      <th>fractal_dimension_mean</th>\n",
       "      <th>...</th>\n",
       "      <th>radius_worst</th>\n",
       "      <th>texture_worst</th>\n",
       "      <th>perimeter_worst</th>\n",
       "      <th>area_worst</th>\n",
       "      <th>smoothness_worst</th>\n",
       "      <th>compactness_worst</th>\n",
       "      <th>concavity_worst</th>\n",
       "      <th>concave_points_worst</th>\n",
       "      <th>symmetry_worst</th>\n",
       "      <th>fractal_dimension_worst</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>0.2419</td>\n",
       "      <td>0.07871</td>\n",
       "      <td>...</td>\n",
       "      <td>25.38</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>0.1812</td>\n",
       "      <td>0.05667</td>\n",
       "      <td>...</td>\n",
       "      <td>24.99</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>0.2069</td>\n",
       "      <td>0.05999</td>\n",
       "      <td>...</td>\n",
       "      <td>23.57</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>0.2597</td>\n",
       "      <td>0.09744</td>\n",
       "      <td>...</td>\n",
       "      <td>14.91</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>0.1809</td>\n",
       "      <td>0.05883</td>\n",
       "      <td>...</td>\n",
       "      <td>22.54</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \\\n",
       "0        17.99         10.38          122.80     1001.0          0.11840   \n",
       "1        20.57         17.77          132.90     1326.0          0.08474   \n",
       "2        19.69         21.25          130.00     1203.0          0.10960   \n",
       "3        11.42         20.38           77.58      386.1          0.14250   \n",
       "4        20.29         14.34          135.10     1297.0          0.10030   \n",
       "\n",
       "   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \\\n",
       "0           0.27760          0.3001              0.14710         0.2419   \n",
       "1           0.07864          0.0869              0.07017         0.1812   \n",
       "2           0.15990          0.1974              0.12790         0.2069   \n",
       "3           0.28390          0.2414              0.10520         0.2597   \n",
       "4           0.13280          0.1980              0.10430         0.1809   \n",
       "\n",
       "   fractal_dimension_mean           ...             radius_worst  \\\n",
       "0                 0.07871           ...                    25.38   \n",
       "1                 0.05667           ...                    24.99   \n",
       "2                 0.05999           ...                    23.57   \n",
       "3                 0.09744           ...                    14.91   \n",
       "4                 0.05883           ...                    22.54   \n",
       "\n",
       "   texture_worst  perimeter_worst  area_worst  smoothness_worst  \\\n",
       "0          17.33           184.60      2019.0            0.1622   \n",
       "1          23.41           158.80      1956.0            0.1238   \n",
       "2          25.53           152.50      1709.0            0.1444   \n",
       "3          26.50            98.87       567.7            0.2098   \n",
       "4          16.67           152.20      1575.0            0.1374   \n",
       "\n",
       "   compactness_worst  concavity_worst  concave_points_worst  symmetry_worst  \\\n",
       "0             0.6656           0.7119                0.2654          0.4601   \n",
       "1             0.1866           0.2416                0.1860          0.2750   \n",
       "2             0.4245           0.4504                0.2430          0.3613   \n",
       "3             0.8663           0.6869                0.2575          0.6638   \n",
       "4             0.2050           0.4000                0.1625          0.2364   \n",
       "\n",
       "   fractal_dimension_worst  \n",
       "0                  0.11890  \n",
       "1                  0.08902  \n",
       "2                  0.08758  \n",
       "3                  0.17300  \n",
       "4                  0.07678  \n",
       "\n",
       "[5 rows x 30 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_data = data.drop(['diagnosis'], axis=1)\n",
    "      \n",
    "X_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['M', 'M', 'M', 'M', 'M', 'M'], dtype=object)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_data = np.ravel(data[['diagnosis']])  \n",
    "y_data[0:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\fixes.py:313: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  _nan_object_mask = _nan_object_array != _nan_object_array\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_trainingSet, X_testSet, y_trainingSet, y_testSet = train_test_split(X_data, y_data, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(426, 30)\n"
     ]
    }
   ],
   "source": [
    "print(X_trainingSet.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(143, 30)\n"
     ]
    }
   ],
   "source": [
    "print(X_testSet.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.5 算法选择及其超级参数的设置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "myModel = KNeighborsClassifier(algorithm='kd_tree')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.6 具体模型的训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "myModel.fit(X_trainingSet, y_trainingSet)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.7 用模型进行预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_predictSet = myModel.predict(X_testSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['M' 'M' 'B' 'M' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B'\n",
      " 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B' 'B' 'M' 'M' 'M' 'M' 'B' 'M' 'B' 'B' 'B'\n",
      " 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B'\n",
      " 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B'\n",
      " 'M' 'M' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B'\n",
      " 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'\n",
      " 'M' 'M' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'M'\n",
      " 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'M' 'B']\n"
     ]
    }
   ],
   "source": [
    "print(y_predictSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['B' 'M' 'B' 'M' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B'\n",
      " 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B' 'B' 'M' 'M' 'M' 'M' 'B' 'M' 'M' 'B' 'B'\n",
      " 'M' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B'\n",
      " 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B'\n",
      " 'M' 'M' 'B' 'M' 'M' 'M' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B'\n",
      " 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'\n",
      " 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'M'\n",
      " 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'B']\n"
     ]
    }
   ],
   "source": [
    "print(y_testSet)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.8 模型评价"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.937062937063\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "print(accuracy_score(y_testSet, y_predictSet))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 44.9 模型的应用与优化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
